## load relevant libraries and functions
## (library() errors immediately if a package is missing; require() only
## returns FALSE with a warning, which can mask a broken environment)
library(knitr) # for knitting
library(Hmisc) # for descriptives
library(png) # for working with images
library(grid)
library(DT) # for data tables
library(tidyverse) # for everything else
library(ggeffects) # for regression outputs
library(broom.mixed)
library(emmeans)
library(generics)
library(broom)
library(modelr)
library(lme4) # for mixed effects models
library(ltm) # for IRT stuff
library(TAM)
library(psych)
library(ggplotify) # for plotting
library(patchwork)
## set default code chunk options
## (TRUE/FALSE spelled out — T and F are ordinary variables and can be reassigned)
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
## set default plot theme and colors
theme_set(theme_classic(base_size = 18))
## fix print width for knitted doc
options(width = 70)
## suppress warnings about grouping
options(dplyr.summarise.inform = FALSE)
options(xtable.floating = FALSE)
options(xtable.timestamp = "")
## set random seed
set.seed(1)
## set directories for plots and data outputs
figures_dir = '../figures/'
data_dir = '../data/'
# Import data as downloaded from https://data-visualization-benchmark.s3.us-west-2.amazonaws.com/vt-fusion/psych_139_mini_project_1_split_80_responses.csv
df.raw =
  read.csv(paste0(data_dir,
                  "psych_139_mini_project_1_split_80_responses.csv"))
# run quick data summary
skimr::skim(df.raw)| Name | df.raw |
| Number of rows | 15616 |
| Number of columns | 15 |
| _______________________ | |
| Column type frequency: | |
| character | 12 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| participant_id | 0 | 1 | 24 | 24 | 0 | 426 | 0 |
| question_text | 0 | 1 | 30 | 375 | 0 | 150 | 0 |
| image_url | 0 | 1 | 83 | 92 | 0 | 163 | 0 |
| test_name | 0 | 1 | 3 | 5 | 0 | 5 | 0 |
| graph_type | 0 | 1 | 3 | 16 | 0 | 72 | 0 |
| task_category | 0 | 1 | 3 | 25 | 0 | 26 | 0 |
| possible_responses | 0 | 1 | 13 | 327 | 0 | 124 | 0 |
| table_presentation_format | 0 | 1 | 4 | 5 | 0 | 2 | 0 |
| participant_response | 0 | 1 | 0 | 112 | 356 | 403 | 0 |
| task_type_merged | 0 | 1 | 20 | 22 | 0 | 3 | 0 |
| correct_answer | 0 | 1 | 1 | 112 | 0 | 122 | 0 |
| misleading_item | 0 | 1 | 4 | 5 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| X | 0 | 1 | 9795.11 | 5651.72 | 82 | 4830.75 | 9649.5 | 14721.25 | 19526 | ▇▇▇▇▇ |
| item_id | 0 | 1 | 114.91 | 66.59 | 1 | 57.00 | 114.0 | 174.00 | 229 | ▇▇▇▇▇ |
| correct_response | 0 | 1 | 0.63 | 0.48 | 0 | 0.00 | 1.0 | 1.00 | 1 | ▅▁▁▁▇ |
# convert data to wide format so each row represents one participant (N = 426)
# and each column is one question; if a participant answered the same item
# more than once, values_fn = mean averages the 0/1 responses
df.wide =
  df.raw %>%
  pivot_wider(id_cols = "participant_id",
              names_from = "question_text",
              values_from = "correct_response",
              values_fn = mean)
# print top rows
head(df.wide, n = 10)## # A tibble: 10 × 151
## participant_id Which month has 25 m…¹ How many months have…²
## <chr> <dbl> <dbl>
## 1 6631326877f343bf84e1… 1 1
## 2 66abd9806349818b3dab… 1 1
## 3 610898813e48dc286496… 0 NA
## 4 62e2deadd18974300540… 1 NA
## 5 63c6d6d5e132610bf009… 1 1
## 6 64136da866ff27f7bf98… 0 1
## 7 6737d3a1eea5bb6f29a9… 1 1
## 8 6159fb27b6340dda3fc6… 0 1
## 9 6760dca655eab7da158e… 1 1
## 10 677e75b259b7f06d0c32… 1 1
## # ℹ abbreviated names: ¹`Which month has 25 mm of rain?`,
## # ²`How many months have less than 40mm of rain?`
## # ℹ 148 more variables: `Which season has the most rain?` <dbl>,
## # `In which season does each month have less rain than the month before?` <dbl>,
## # `Which season has more rain, the summer or the spring?` <dbl>,
## # `In San Francisco, as the weather gets warmer, there is generally more rain.` <dbl>,
## # `How much does it rain in March?` <dbl>, …
Across all tests, we have a total of 184 items (150 questions), with about 80-90 responses per item.
# Create summary df: For each question, compute % of participants who responded correctly (mean), SD, and SE
df.wide_summary =
  df.raw %>%
  # one row per unique item; test_name/graph_type/task_type_merged are
  # constant within a question, so they ride along as grouping keys
  group_by(question_text, test_name, graph_type, task_type_merged) %>%
  # compute mean (TRUE/FALSE spelled out rather than reassignable T/F)
  dplyr::summarise(mean_correct = mean(correct_response,
                                       na.rm = TRUE),
                   # compute SD
                   sd_correct = sd(correct_response,
                                   na.rm = TRUE),
                   # compute number of responses
                   n = n(),
                   # compute SE (SD / sqrt of n)
                   se_correct = sd_correct/sqrt(n)) %>%
  ungroup()
# print top rows
head(df.wide_summary, n = 10)## # A tibble: 10 × 8
## question_text test_name graph_type task_type_merged mean_correct
## <chr> <chr> <chr> <chr> <dbl>
## 1 A group of male… VLAT Scatter statistical inf… 0.782
## 2 A group of the … VLAT Scatter statistical inf… 0.616
## 3 About how much … VLAT Line arithmetic comp… 0.784
## 4 About what is t… VLAT Pie value identific… 0.619
## 5 About what is t… VLAT Stacked B… value identific… 0.494
## 6 About what was … VLAT Stacked A… value identific… 0.273
## 7 According to yo… GGR Line statistical inf… 0.494
## 8 Approximately w… GGR Line value identific… 0.831
## 9 Approximately w… GGR Pie arithmetic comp… 0.671
## 10 Approximately, … CALVI Stacked A… arithmetic comp… 0.176
## # ℹ 3 more variables: sd_correct <dbl>, n <int>, se_correct <dbl>
Below, we evaluate means and SDs per test:
## [1] 0.6288262
## [1] 0.2738472
# summarise item-level difficulty per test: mean/SD and range of % correct
df.wide_summary %>%
group_by(test_name) %>%
summarise(mean = mean(mean_correct),
sd = sd(mean_correct),
min = min(mean_correct),
max = max(mean_correct))## # A tibble: 5 × 5
## test_name mean sd min max
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 BRBF 0.699 0.214 0.2 0.988
## 2 CALVI 0.435 0.285 0.0120 0.964
## 3 GGR 0.620 0.291 0.129 0.943
## 4 VLAT 0.657 0.259 0.0723 0.988
## 5 WAN 0.795 0.192 0.227 0.989
The plot below shows that as expected, even within a single test, the items varied considerably in their difficulty (as measured by % correct).
df.wide_summary %>%
  # question on x-axis, ordered by % correct
  # BUG FIX: `color = graph_type` was previously passed to ggplot() OUTSIDE
  # aes(), where it lands in `...` and is silently ignored — moved inside
  # aes() so it actually maps graph type onto the data
  ggplot(aes(x = reorder(question_text, mean_correct),
             # % correct on y-axis
             y = mean_correct*100,
             # color each bar based on graph type
             color = graph_type)) +
  # add CIs based on 1.96*SE
  geom_errorbar(
    aes(ymin = (mean_correct - (1.96*se_correct))*100,
        ymax = (mean_correct + (1.96*se_correct))*100),
    width = 2,
    color = "black",
    alpha = .7,
    size = 1) +
  # plot one point per question/item
  geom_point(aes(fill = graph_type),
             shape = 21, color = "black", size = 2) +
  # facet by task type and test
  facet_wrap(~ task_type_merged + test_name,
             ncol = 5, scales = "free_y") +
  # adjust x-axis label orientation for readability
  # (size = 1 makes labels near-invisible; kept as-is but worth revisiting)
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1,
                                   size = 1),
        legend.position = "none") +
  # fix scale to be 0-100, breaks of 25
  scale_y_continuous(limits = c(0, 100),
                     breaks = seq(0, 100, 25)) +
  # add title and axis labels
  labs(title = "Individual Item Difficulty by Task Type & Test",
       x = "Question",
       y = "% correct")The plot below shows that even though different tests contained items of the same “task type”, their respective item difficulty still varied across tests.
# create new sub-dataframe: Within a given test, collapse across a given task type and compute % of participants who responded correctly (mean), SD, and SE
df.task_summary_tasktype =
  df.raw %>%
  # group by test and task type
  group_by(test_name, task_type_merged) %>%
  # compute mean
  summarise(mean_correct = mean(correct_response, na.rm = TRUE),
            # compute SD
            sd_correct = sd(correct_response, na.rm = TRUE),
            # compute number of responses
            n = n(),
            # compute SE
            se_correct = sd_correct / sqrt(n)) %>%
  # drop the residual test_name grouping that summarise() leaves behind
  ungroup()
df.task_summary_tasktype %>%
  # test type on x-axis
  ggplot(aes(x = test_name,
             # % correct on y-axis
             y = (mean_correct*100),
             # color by test name
             fill = test_name,
             color = test_name)) +
  # add CIs based on 1.96*SE
  # BUG FIX: ymin previously multiplied SE by 1.96 twice
  # (1.96*(1.96*se_correct)), making the lower arm ~3.84*SE instead of
  # 1.96*SE and rendering the error bars asymmetric
  geom_errorbar(aes(ymin = (mean_correct - (1.96*se_correct))*100,
                    ymax = (mean_correct + (1.96*se_correct))*100),
                position = position_dodge(width = 0.9),
                width = 0.1,
                color = "black") +
  # Add one point for each % correct collapsed across task type
  # (removed invalid `width` argument — geom_point has no width parameter
  # and it only produced an "ignoring unknown parameters" warning)
  geom_point(position = position_dodge(width = 0.9),
             size = 3) +
  # Adjust scales to be 0-100
  scale_y_continuous(limits = c(0, 100),
                     breaks = seq(0, 100, 25),
                     labels = scales::percent_format(scale = 1)) +
  # Add title and axis labels
  labs(title = "% correct averaged across Task Type",
       x = "Task Type",
       y = "% correct",
       fill = "Test") +
  # Adjust theme
  theme_minimal(base_size = 18) +
  theme(axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        legend.position = "bottom") +
  guides(fill = "none") +
  # Facet by task type
  facet_grid(~task_type_merged)In the output below, we see that on some items, as few as 1% of participants selected the correct response (very difficult items), and on other items, as many as 99% of participants selected the correct response (very easy items).
Across items, SDs of % correct ranged from 10.6% to 50%.
## Across dataset, compute means and SDs as well as mins and maxes for each respective statistic
# base summary() prints min/quartiles/mean/max of the item-level columns
df.wide_summary %>%
dplyr::select(mean_correct, sd_correct) %>%
summary()## mean_correct sd_correct
## Min. :0.01205 Min. :0.1066
## 1st Qu.:0.40964 1st Qu.:0.3258
## Median :0.70177 Median :0.4142
## Mean :0.62883 Mean :0.3870
## 3rd Qu.:0.85882 3rd Qu.:0.4802
## Max. :0.98864 Max. :0.5029
# Print 10 questions with lowest average % correct
# (arranged descending, so tail() returns the 10 hardest items)
df.wide_summary %>%
dplyr::select(question_text, mean_correct, sd_correct) %>%
arrange(desc(mean_correct)) %>%
tail(n = 10) %>%
print()## # A tibble: 10 × 3
## question_text mean_correct sd_correct
## <chr> <dbl> <dbl>
## 1 What was the number of girls named 'Amelia… 0.145 0.354
## 2 Assuming today is May 10 and given the cha… 0.141 0.350
## 3 Which of the treatments contributes to a l… 0.129 0.338
## 4 What is the range in weight for the 85 mal… 0.0833 0.278
## 5 Do U.S. states have a more similar number … 0.0814 0.275
## 6 What was the unemployment rate for Indiana… 0.0723 0.261
## 7 Are there more households with exactly 1 c… 0.0575 0.234
## 8 Assuming today is May 01, 2022, which of t… 0.0488 0.217
## 9 Does any members of species C weigh more t… 0.0233 0.152
## 10 Does cell phone brand A have more than hal… 0.0120 0.110
# Print 10 questions with highest average % correct
# (arranged descending, so head() returns the 10 easiest items)
df.wide_summary %>%
dplyr::select(question_text, mean_correct, sd_correct) %>%
arrange(desc(mean_correct)) %>%
head(n = 10) %>%
print()## # A tibble: 10 × 3
## question_text mean_correct sd_correct
## <chr> <dbl> <dbl>
## 1 Which month has 25 mm of rain? 0.989 0.107
## 2 In which company is the global smartphone … 0.988 0.108
## 3 Do all of these planets have the same life… 0.988 0.109
## 4 In which country is the average internet s… 0.976 0.153
## 5 Which planet has the highest life expectan… 0.976 0.153
## 6 Over the course of years between 2009 and … 0.976 0.154
## 7 Do all of these planets have the same life… 0.966 0.184
## 8 What is the trend of the monthly number of… 0.964 0.187
## 9 Which season has the most rain? 0.964 0.187
## 10 Do all of these planets have the same life… 0.964 0.188
# Print 10 questions with least variability
# (arranged by descending SD, so tail() returns the 10 least variable items)
df.wide_summary %>%
dplyr::select(question_text, mean_correct, sd_correct) %>%
arrange(desc(sd_correct)) %>%
tail(n = 10) %>%
print()## # A tibble: 10 × 3
## question_text mean_correct sd_correct
## <chr> <dbl> <dbl>
## 1 What is the trend of the monthly number of… 0.964 0.187
## 2 Do all of these planets have the same life… 0.966 0.184
## 3 Over the course of years between 2009 and … 0.976 0.154
## 4 In which country is the average internet s… 0.976 0.153
## 5 Which planet has the highest life expectan… 0.976 0.153
## 6 Does any members of species C weigh more t… 0.0233 0.152
## 7 Does cell phone brand A have more than hal… 0.0120 0.110
## 8 Do all of these planets have the same life… 0.988 0.109
## 9 In which company is the global smartphone … 0.988 0.108
## 10 Which month has 25 mm of rain? 0.989 0.107
# Print 10 questions with most variability
# (arranged by descending SD, so head() returns the 10 most variable items;
# SD peaks near 0.5 when % correct is close to 50%)
df.wide_summary %>%
dplyr::select(question_text, mean_correct, sd_correct) %>%
arrange(desc(sd_correct)) %>%
head(n = 10) %>%
print()## # A tibble: 10 × 3
## question_text mean_correct sd_correct
## <chr> <dbl> <dbl>
## 1 About what is the ratio of the cost of a s… 0.494 0.503
## 2 According to your best guess, what will th… 0.494 0.503
## 3 The approval rating of Republicans for the… 0.494 0.503
## 4 The residents of town Y are voting on a fa… 0.512 0.503
## 5 What is the height for a person who lies o… 0.512 0.503
## 6 What is the trend of the average price of … 0.517 0.503
## 7 How many planets spend approximately the s… 0.476 0.502
## 8 How many planets spend approximately the s… 0.476 0.502
## 9 In which city is the cost of soda the high… 0.535 0.502
## 10 What has the general average unemployment … 0.534 0.502
In the dataset we examined above, we have data from a series of tests that (supposedly) tap data viz literacy. Notably, not all items in and across these tests are the same. In any test, you want a variety of items. Based on this variety, it’s likely some items will be harder (and some will be easier) than others. I begin by performing some sanity checks that assess whether the items we would expect to be harder were indeed harder for participants (as per lower % correct).
That is, does misleading nature (categorical; yes vs. no) predict % correct?
Note: Only CALVI included misleading items. Thus, we examine data only from this test (4,082 responses from N = 425 participants).
Prediction: Misleading items are lower % correct (i.e., higher difficulty) than non-misleading items.
Method: Mixed effects logistic regression, with a fixed effect for item nature and a random intercept per participant.
# Extract relevant subset of data
df.misleading =
  df.raw %>%
  # restrict to CALVI, the only test containing misleading items
  filter(test_name == "CALVI") %>%
  # keep just the columns this analysis needs
  dplyr::select(participant_id, question_text, test_name, graph_type, task_type_merged, correct_response, misleading_item) %>%
  # encode misleading status as a factor (levels False/True)
  mutate(misleading_item = factor(misleading_item))Out of the 48 items we have data on in CALVI, 36 are misleading (75%), and 12 are not (25%).
# count misleading items
# (distinct() first so each item is counted once, not once per response)
df.misleading %>%
distinct(question_text, misleading_item) %>%
count(misleading_item)## misleading_item n
## 1 False 12
## 2 True 36
# response-level descriptives: mean/SD of correctness by misleading status
df.misleading %>%
group_by(misleading_item) %>%
summarise(mean = mean(correct_response, na.rm = T),
sd = sd(correct_response, na.rm = T),
n = n())## # A tibble: 2 × 4
## misleading_item mean sd n
## <fct> <dbl> <dbl> <int>
## 1 False 0.700 0.459 1022
## 2 True 0.346 0.476 3060
In line with the descriptive stats above, the plot shows that at first glance, the average % correct for misleading items seems to be lower than the average % correct for non-misleading items.
# Plot the raw data
df.misleading %>%
# misleading nature (true vs false) on x-axis
ggplot(aes(x = misleading_item,
# % correct on y-axis
y = (correct_response*100),
# color by misleading nature
fill = misleading_item)) +
# add raw data points
geom_point(position = position_jitter(width = .3,
height = 0),
shape = 21,
size = 2,
alpha = .3) +
# add bootstrapped CIs for error bars
stat_summary(fun.data = "mean_cl_boot",
geom = "pointrange",
color = "black",
shape = 21,
size = 1,
linewidth = 3) +
# adjust scales
scale_y_continuous(limits = c(0, 100),
breaks = seq(0, 100, 25),
labels = scales::percent_format(scale = 1)) +
# add color scale for fill
scale_fill_brewer(palette = "Pastel2") +
# Adjust title and axis labels
labs(title = "% correct by misleading vs. non-misleading items",
x = "Misleading Item",
y = "% correct",
fill = "Misleading?") +
# Adjust theme
theme_classic(base_size = 36) +
theme(legend.position = "none")The logistic regression analysis shows that compared to a minimal model that fits the mean regardless of item nature (misleading vs. not misleading), the model including item nature improved fit to data:
\(χ^2\) = 356.6, df = 1, β = -1.51, SE = .080, p < .001; RMSE_model = .463 vs RMSE_baseline = 0.496.
Predicted % correct for non-misleading items was about twice as large (70%; 95% CIs = [.67, .73]) as % correct for misleading items (34%; 95% CIs = [.32, .36]).
df.misleading =
df.misleading %>%
# participant_id as factor for use as a random-effect grouping variable
mutate(participant_id = factor(participant_id))
# Run empty model fitting the mean
# NOTE(review): this null model is a gaussian glm (identity link), while the
# augmented model below is a binomial glmer — the two use different
# likelihoods and are not nested, so the anova() comparison further down is
# not a valid likelihood-ratio test. Consider a binomial null model, e.g.
# glmer(correct_response ~ 1 + (1 | participant_id), family = "binomial").
model.misleading_empty =
glm(correct_response ~
# intercept
1,
data = df.misleading)
model.misleading_empty %>%
summary()##
## Call:
## glm(formula = correct_response ~ 1, data = df.misleading)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.434346 0.007759 55.98 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2457497)
##
## Null deviance: 1002.9 on 4081 degrees of freedom
## Residual deviance: 1002.9 on 4081 degrees of freedom
## AIC: 5858.4
##
## Number of Fisher Scoring iterations: 2
# RMSE
# (root mean squared error of the null model's fitted mean vs. the 0/1 outcomes)
rmse_empty =
# square root
sqrt(
# take mean
mean((df.misleading$correct_response -
predict(model.misleading_empty, type = "response"))^2
# errors squared
)
)
print(rmse_empty)## [1] 0.4956708
########################
# Run augmented model adding predictor of interest (misleading nature):
# a logistic mixed-effects model with a fixed effect of item nature
# and a random intercept per participant
model.misleading_augmented =
  glmer(correct_response ~ 1 + misleading_item + (1 | participant_id),
        data = df.misleading,
        family = "binomial")
# Print regression output
model.misleading_augmented %>%
summary()## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## correct_response ~ 1 + misleading_item + (1 | participant_id)
## Data: df.misleading
##
## AIC BIC logLik deviance df.resid
## 5195.3 5214.2 -2594.6 5189.3 4079
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.7064 -0.7271 -0.6680 1.2000 1.5747
##
## Random effects:
## Groups Name Variance Std.Dev.
## participant_id (Intercept) 0.08683 0.2947
## Number of obs: 4082, groups: participant_id, 425
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.86159 0.07082 12.17 <2e-16 ***
## misleading_itemTrue -1.51333 0.08013 -18.89 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## msldng_tmTr -0.856
## model term df1 df2 F.ratio Chisq p.value
## misleading_item 1 Inf 356.640 356.64 <.0001
## $misleading_item
## # Predicted probabilities of correct_response
##
## misleading_item | Predicted | 95% CI
## ----------------------------------------
## False | 0.70 | 0.67, 0.73
## True | 0.34 | 0.32, 0.36
##
## Adjusted for:
## * participant_id = 0 (population-level)
##
## attr(,"class")
## [1] "ggalleffects" "list"
## attr(,"model.name")
## [1] "."
## # A tibble: 1 × 7
## nobs sigma logLik AIC BIC deviance df.residual
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 4082 1 -2595. 5195. 5214. 5052. 4079
# Compute RMSE
# NOTE(review): predict() on a glmer conditions on the fitted random effects
# by default (re.form = NULL), so this RMSE uses participant-specific
# predictions — confirm that is intended before comparing against rmse_empty.
rmse_misleading =
# square root
sqrt(
# take mean
mean((df.misleading$correct_response -
predict(model.misleading_augmented, type = "response"))^2
# errors squared
)
)
print(rmse_misleading)## [1] 0.4630246
# Compare empty model with augmented model
# NOTE(review): this mixes a gaussian glm with a binomial glmer; the printed
# table below is just the glm's own deviance table, not a model comparison —
# deviances/likelihoods are not comparable across these families. Refit the
# null model with family = "binomial" so anova() performs a valid LRT.
anova(model.misleading_empty, model.misleading_augmented)## Analysis of Deviance Table
##
## Model: gaussian, link: identity
##
## Response: correct_response
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 4081 1002.9
# plot the data with line for empty model prediction (mean)
df.misleading %>%
# x-axis is misleading nature
ggplot(aes(x = misleading_item,
# y-axis is % correct
y = (correct_response*100),
# color by misleading nature
fill = misleading_item)) +
# add raw data points
geom_point(position = position_jitter(width = .3,
height = 0),
shape = 21,
size = 2,
alpha = .3) +
# add bootstrapped CIs
stat_summary(fun.data = "mean_cl_boot",
geom = "pointrange",
color = "black",
shape = 21,
size = 1,
linewidth = 3) +
# add horizontal line for mean
geom_hline(yintercept = mean(df.misleading$correct_response)*100,
color = "lightblue", size = 3) +
# adjust scale
scale_y_continuous(limits = c(0, 100),
breaks = seq(0, 100, 25),
labels = scales::percent_format(scale = 1)) +
# specify color fill
scale_fill_brewer(palette = "Pastel2") +
# add title and axis labels
labs(title = "% correct by misleading vs. non-misleading items",
x = "Misleading Item",
y = "% correct",
fill = "Misleading?") +
# adjust theme
theme_classic(base_size = 36) +
theme(legend.position = "none")# plot the data with lines for augmented model predictions (separate fitted means for misleading and non-misleading items)
df.misleading %>%
# x-axis = misleading nature
ggplot(aes(x = misleading_item,
# y-axis = % correct
y = (correct_response*100),
# fill by misleading nature
fill = misleading_item)) +
# add raw data points
geom_point(position = position_jitter(width = .3,
height = 0),
shape = 21,
size = 2,
alpha = .3) +
# add bootstrapped CIs
stat_summary(fun.data = "mean_cl_boot",
geom = "pointrange",
color = "black",
shape = 21,
size = 1,
linewidth = 3) +
# add separate fitted lines (group means) for misleading and non-misleading items
geom_hline(yintercept = mean(df.misleading$correct_response[df.misleading$misleading_item == "True"])*100,
color = "orange", size = 3) +
geom_hline(yintercept = mean(df.misleading$correct_response[df.misleading$misleading_item == "False"])*100,
color = "lightgreen", size = 3) +
# adjust scale
scale_y_continuous(limits = c(0, 100),
breaks = seq(0, 100, 25),
labels = scales::percent_format(scale = 1)) +
# specify color fill
scale_fill_brewer(palette = "Pastel2") +
# add title and axis labels
labs(title = "% correct by misleading vs. non-misleading items",
x = "Misleading Item",
y = "% correct",
fill = "Misleading?") +
# adjust theme
theme_classic(base_size = 36) +
theme(legend.position = "none")That is, does question length (continuous) predict % correct?
Note: I examined data from all tests (15,616 responses from N = 426 participants) and defined question length as the number of characters in the question.
Prediction: Longer questions are lower % correct (i.e., higher difficulty) than short questions.
Method: Mixed effects linear regression, with a fixed effect for question length and a random intercept per participant.
# Extract relevant subset of data
df.length =
df.raw %>%
# Keep only relevant cols
dplyr::select(participant_id, test_name, graph_type, task_type_merged, correct_response, question_text) %>%
# Compute question length (number of characters, incl. spaces/punctuation)
mutate(question_length = nchar(question_text))
# create summary df: one row per question with % correct and question length
df.length_summary =
df.length %>%
dplyr::select(question_text, correct_response) %>%
group_by(question_text) %>%
summarise(
percent_correct = mean(correct_response, na.rm = TRUE) * 100,
n = n(),
# first(question_text) is just the grouping value itself, so this
# recovers each question's character count within the summary
question_length = nchar(first(question_text))
) %>%
ungroup()There is some variation in question length both within and across tests, with the average question length ranging from 48 to 95 characters.
# create summary statistics for question length variable
df.length %>%
group_by(test_name) %>%
summarise(mean = mean(question_length),
sd = sd(question_length))## # A tibble: 5 × 3
## test_name mean sd
## <chr> <dbl> <dbl>
## 1 BRBF 85.6 34.9
## 2 CALVI 94.7 46.3
## 3 GGR 92.5 30.2
## 4 VLAT 77.5 26.1
## 5 WAN 47.7 16.3
The plot shows that at first glance, most questions are clustered around question length 50-150. Even within that range, % correct is highly variable, and there is not much of a visually clear trend.
# Plot raw data
df.length_summary %>%
# x-axis = question length
ggplot(aes(x = question_length,
# y-axis = % correct
y = percent_correct)) +
# plot raw data
geom_point(position = position_jitter(width = .5,
height = 0),
size = 2,
alpha = .3) +
# adjust scale
scale_y_continuous(limits = c(0, 100),
breaks = seq(0, 100, 25),
labels = scales::percent_format(scale = 1)) +
xlim(0, 375) +
# add axis labels
labs(y = "% correct",
x = "Question Length")The regression analysis shows that compared to a minimal model that fits the mean regardless of question length, the model including question length improved fit to data:
\(χ^2\) = 430, df = 1, β standardized = -.37, SE = .0001, p < .001; RMSE_model = .459 vs RMSE_baseline = 0.483.
On average, for every unit increase in question length (1 unit = 1 SD; standardized), % correct was predicted to decrease by .2%.
df.length =
  df.length %>%
  mutate(participant_id = factor(participant_id),
         # z-score question length so the fixed effect is in SD units
         # (TRUE/FALSE spelled out rather than reassignable T/F; note that
         # scale() returns a 1-column matrix, which lme4 accepts)
         question_length = scale(question_length,
                                 center = TRUE, scale = TRUE))
# Run empty model fitting the mean
# NOTE(review): gaussian glm null vs. binomial glmer augmented model below —
# different likelihoods, so they cannot be compared via likelihood-ratio test.
model.length_empty =
  glm(correct_response ~
        # intercept
        1,
      data = df.length)
model.length_empty %>%
summary()##
## Call:
## glm(formula = correct_response ~ 1, data = df.length)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.628458 0.003867 162.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2335135)
##
## Null deviance: 3646.3 on 15615 degrees of freedom
## Residual deviance: 3646.3 on 15615 degrees of freedom
## AIC: 21606
##
## Number of Fisher Scoring iterations: 2
# RMSE
rmse_empty =
# square root
sqrt(
# take mean
mean((df.length$correct_response -
predict(model.length_empty, type = "response"))^2
# errors squared
)
)
print(rmse_empty)## [1] 0.4832169
########################
# Run augmented model adding predictor of interest (question length)
model.length_augmented =
glmer(correct_response ~
# intercept
1 +
# fixed effect for question length
question_length +
# random intercept for participant
(1 | participant_id),
family = "binomial",
data = df.length)
# Print regression output
model.length_augmented %>%
summary()## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## correct_response ~ 1 + question_length + (1 | participant_id)
## Data: df.length
##
## AIC BIC logLik deviance df.resid
## 19790.9 19813.9 -9892.5 19784.9 15613
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.7999 -1.0772 0.5761 0.7388 5.0870
##
## Random effects:
## Groups Name Variance Std.Dev.
## participant_id (Intercept) 0.2733 0.5227
## Number of obs: 15616, groups: participant_id, 426
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.56233 0.03077 18.28 <2e-16 ***
## question_length -0.37418 0.01948 -19.20 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## qustn_lngth -0.020
## model term df1 df2 F.ratio Chisq p.value
## question_length 1 Inf 368.825 368.825 <.0001
## # A tibble: 1 × 7
## nobs sigma logLik AIC BIC deviance df.residual
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 15616 1 -9892. 19791. 19814. 19010. 15613
# Compute RMSE
rmse_questionlength =
# square root
sqrt(
# take mean
mean((df.length$correct_response -
predict(model.length_augmented, type = "response"))^2
# errors squared
)
)
print(rmse_questionlength)## [1] 0.4586007
## Analysis of Deviance Table
##
## Model: gaussian, link: identity
##
## Response: correct_response
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 15615 3646.3
# plot empty model with mean
df.length %>%
# x-axis = question length
ggplot(aes(x = question_length,
# y-axis = % correct
y = correct_response)) +
# plot raw data points
geom_point(position = position_jitter(width = .5,
height = 0),
size = 2,
alpha = .5) +
# add fitted line for mean
geom_hline(yintercept = mean(df.length$correct_response),
color = "lightblue", size = 1) +
# adjust axis labels
labs(y = "Correct (y = 1, n = 0)",
x = "Question Length")# plot the data with a fitted line for the augmented model predictions
df.length %>%
# question length on x-axis
ggplot(aes(x = question_length,
# % correct on y-axis
y = correct_response)) +
# add raw data
geom_point(alpha = 0.6, size = 2) +
# add regression line
geom_smooth(method = "lm", se = TRUE, color = "lightblue") +
# add title and axis labels
labs(title = "% correct by question length",
x = "Question Length",
y = "Correct (y = 1, n = 0)")We have to reduce the data set used for model A to just CALVI items in order to match the trial-level data that is fed into the model.
# Create new data sets to match trial-level data fed into model
df.comparison_model =
df.raw %>%
# Keep only relevant cols
dplyr::select(participant_id, question_text, test_name, graph_type, task_type_merged, correct_response, misleading_item) %>%
# Filter for CALVI data
filter(test_name == "CALVI") %>%
# Convert "misleading nature" into factor (True vs false)
mutate(misleading_item = factor(misleading_item)) %>%
# calculate question length
mutate(question_length = nchar(question_text)) %>%
# format relevant variables (z-score length within the CALVI subset)
mutate(participant_id = factor(participant_id),
question_length = scale(question_length,
center = T, scale = T))
# Fit models anew
# Model A
model.comparison.misleading =
glmer(correct_response ~
# intercept
1 +
# fixed effect for misleading nature (Model A's predictor)
misleading_item +
# random intercept for participant
(1 | participant_id),
family = "binomial",
data = df.comparison_model)
# Model B
model.comparison.length =
glmer(correct_response ~
# intercept
1 +
# fixed effect for question length
question_length +
# random intercept for participant
(1 | participant_id),
family = "binomial",
data = df.comparison_model)
# Compare model A with model B
# NOTE(review): these models are non-nested (different single fixed effects,
# identical df), so the likelihood-ratio Chisq/df printed by anova() is not
# meaningful here — compare via AIC/BIC instead.
anova(model.comparison.misleading, model.comparison.length)## Data: df.comparison_model
## Models:
## model.comparison.misleading: correct_response ~ 1 + misleading_item + (1 | participant_id)
## model.comparison.length: correct_response ~ 1 + question_length + (1 | participant_id)
## npar AIC BIC logLik deviance Chisq
## model.comparison.misleading 3 5195.3 5214.2 -2594.6 5189.3
## model.comparison.length 3 5568.8 5587.7 -2781.4 5562.8 0
## Df Pr(>Chisq)
## model.comparison.misleading
## model.comparison.length 0
Next, we evaluate item information and item discrimination for all test items across all tests from an IRT perspective.
In IRT, item difficulty is defined as the level of latent ability (i.e., data visualization) or point on the ability scale (represented by theta) needed to have a 50% probability of getting the item right. Harder items require more ability to have a 50% probability of getting the item right. Easier items require less ability to have a 50% probability of getting the item right.
We fit two-parameter logistic (2PL) models, where the expected difference in performance (i.e., probability of getting the item right) depends not just on ability level but also the specific item in question. In other words, subjects’ responses are predicted by the (hypothesized) ability as well as item difficulty.
Item information represents each item’s ability to differentiate between test-takers based on ability level (theta). We plot item information using Item Characteristic Curves (ICCs), where the x-axis represents ability level (theta), and the y-axis represents probability of getting a correct response. Higher item information is better.
Item discrimination represents how strongly related a given item is to the latent ability in question as assessed by the broader test. Items with higher discrimination are better at differentiating test-takers. More formally, it is the slope of a tangent at the point of inflection of the item response function (IRF), or the point on the logistic curve at which the curvature changes signs.
In the plots below, we see that for many items, respondents very low on data visualization literacy still have a decent probability of choosing the correct answer. Items differ quite widely in their discrimination.
# Run for WAN
# NOTE(review): df.irt is not created anywhere in this view — presumably built
# upstream from df.raw with participant_id, item_id, correct_response, and
# test_name columns; verify before rerunning.
df.irt_wide =
df.irt %>%
# Filter for WAN items
filter(test_name == "WAN") %>%
dplyr::select(participant_id, item_id, correct_response) %>%
# one column per item (0/1 correctness), one row per participant
pivot_wider(names_from = item_id,
values_from = correct_response) %>%
dplyr::select(-c(participant_id))
# run 2pl model
model.2pl =
# predict by difficulty + ability
ltm(df.irt_wide ~ z1,
# print IRT parameters (coefficient estimates of item difficulty and discrimination)
IRT.param = TRUE,
# 1000 iterations to yield estimates
control = list(iter.em = 1000))
# item characteristic curves
icc_wan =
as.ggplot(~plot(model.2pl,
type = "ICC", zrange = c(-4, 4),
main = "WAN"))
icc_wan

In the plots below, we see that some items follow a step function. For some items, item difficulty is high (even with high levels of ability, respondents have a low estimated probability of selecting the correct answer). Again, items differ quite widely in their discrimination.
# Run for GGR
# Build the wide 0/1 response matrix restricted to GGR items
df.irt_wide =
  df.irt %>%
  filter(test_name == "GGR") %>%
  dplyr::select(participant_id, item_id, correct_response) %>%
  pivot_wider(names_from = item_id,
              values_from = correct_response) %>%
  dplyr::select(-participant_id)
# Fit the 2PL model: ability (z1) plus item difficulty/discrimination
model.2pl =
  ltm(df.irt_wide ~ z1,
      # return parameters in the usual IRT metric
      IRT.param = TRUE,
      # cap EM algorithm at 1000 iterations
      control = list(iter.em = 1000))
# Capture the GGR item characteristic curves as a ggplot object
icc_ggr =
  as.ggplot(~plot(model.2pl, type = "ICC", zrange = c(-4, 4),
                  main = "GGR"))
icc_ggr

The algorithm did not converge.
# Run for BRBF
# One row per participant, one 0/1 column per BRBF item
df.irt_wide =
  df.irt %>%
  filter(test_name == "BRBF") %>%
  dplyr::select(participant_id, item_id, correct_response) %>%
  pivot_wider(names_from = item_id, values_from = correct_response) %>%
  dplyr::select(-participant_id)
# Two-parameter logistic model for BRBF responses
model.2pl =
  ltm(df.irt_wide ~ z1,
      IRT.param = TRUE,                # difficulty/discrimination estimates
      control = list(iter.em = 1000))  # 1000 EM iterations max
# Item characteristic curves for BRBF
icc_brbf =
  as.ggplot(~plot(model.2pl, type = "ICC", zrange = c(-4, 4),
                  main = "BRBF"))
icc_brbf

In the plots below, we see that for many items, respondents low on data visualization literacy still have a decent probability of choosing the correct answer. Items differ quite widely in their discrimination. For some items, item difficulty is high (even with high levels of ability, respondents have a low estimated probability of selecting the correct answer).
# Run for VLAT
# Wide response matrix (participants x VLAT items, 0/1 scored)
df.irt_wide =
  df.irt %>%
  filter(test_name == "VLAT") %>%
  dplyr::select(participant_id, item_id, correct_response) %>%
  pivot_wider(names_from = item_id,
              values_from = correct_response) %>%
  dplyr::select(-participant_id)
# Fit the 2PL IRT model for VLAT
model.2pl =
  ltm(df.irt_wide ~ z1,
      # coefficients reported as IRT difficulty/discrimination
      IRT.param = TRUE,
      # allow up to 1000 EM iterations for convergence
      control = list(iter.em = 1000))
# VLAT item characteristic curves
icc_vlat =
  as.ggplot(~plot(model.2pl, type = "ICC", zrange = c(-4, 4),
                  main = "VLAT"))
icc_vlat

In the plots below, we see that for many items, ICCs seem uninterpretable; as ability level increases, respondents have a lower probability of selecting the correct answer.
# Run for CALVI
# Wide 0/1 response matrix for CALVI items only
df.irt_wide =
  df.irt %>%
  filter(test_name == "CALVI") %>%
  dplyr::select(participant_id, item_id, correct_response) %>%
  pivot_wider(names_from = item_id, values_from = correct_response) %>%
  dplyr::select(-participant_id)
# Fit the 2PL model (latent ability + per-item parameters)
model.2pl =
  ltm(df.irt_wide ~ z1,
      IRT.param = TRUE,                # classic IRT parameterization
      control = list(iter.em = 1000))  # up to 1000 EM iterations
# Item characteristic curves for CALVI
icc_calvi =
  as.ggplot(~plot(model.2pl, type = "ICC", zrange = c(-4, 4),
                  main = "CALVI"))
icc_calvi
# Keep the five per-test ICC panels together in a list for convenience
icc_plots = list(icc_wan, icc_ggr, icc_brbf, icc_vlat, icc_calvi)
# Combine the panels into one row with a shared, centered title (patchwork)
icc_plot =
  icc_wan + icc_ggr + icc_brbf + icc_vlat + icc_calvi +
  plot_layout(ncol = 5) +
  plot_annotation(title = "ICCs by Test",
                  theme = theme(plot.title = element_text(hjust = 0.5)))
print(icc_plot)

The item discrimination and item information analyses suggest that even students with relatively low data visualization literacy have a non-negligible chance of getting the correct response on various items.
Next, we run a model representing a student with low data visualization literacy who “guesses” the answer to a given item in a semi-informed manner (e.g., using an exclusion strategy or similar).
# define dataset
# Full wide response matrix across all tests: one row per participant,
# one 0/1 column per item
df.irt_wide =
  df.irt %>%
  dplyr::select(participant_id, item_id, correct_response) %>%
  pivot_wider(names_from = item_id, values_from = correct_response) %>%
  # participant ids are not model input
  dplyr::select(-participant_id)
# create df with column representing binary vs MC nature of each item
# Classify each item as binary (exactly 2 options) vs multiple choice (>2)
df.binaryvsmc =
  df.irt %>%
  dplyr::select(item_id, possible_responses, correct_response) %>%
  distinct() %>%
  mutate(
    # response options are comma-separated, so commas + 1 = option count
    n_options = str_count(possible_responses, ",") + 1,
    item_type = if_else(n_options == 2, "binary", "mc"))
# item ids for each item type
binary_items =
  df.binaryvsmc %>%
  filter(item_type == "binary") %>%
  pull(item_id)
mc_items =
  df.binaryvsmc %>%
  filter(item_type == "mc") %>%
pull(item_id)

For binary items, we use a 2PL model where we set the latent ability (or theta) to ~1 SD below the average to simulate a low-literacy test-taker.
For any item whose difficulty is around –1, the 2-PL yields about a coin-flip chance of obtaining the correct response.
\[P(\text{correct}\mid\theta=-1) \; =\; \text{logit}^{-1}\!\bigl(a(\theta-b)\bigr) \approx 0.50.\]
For easier items (b < –1) the model lets the learner do better than chance, and for harder items (b > –1) it lets them do worse.
# fit 2pl model to generate predictions for binary items
model.2pl =
  ltm(df.irt_wide ~ z1,
      IRT.param = TRUE,                # classic IRT parameterization
      control = list(iter.em = 1000))  # up to 1000 EM iterations
# item parameter matrix: difficulty (Dffclt) and discrimination (Dscrmn)
pars = coef(model.2pl)
# 2PL item response function: P(correct | theta) with discrimination a
# and difficulty b
logistic =
  function(theta, a, b) 1 / (1 + exp(-a * (theta - b)))
# simulate a low-literacy respondent: 1 SD below mean ability
theta = -1
# model-implied probability of a correct response on each binary item
P_binary =
  logistic(theta,
           pars[binary_items, "Dscrmn"],
           pars[binary_items, "Dffclt"])
# per-item predicted percent correct (rounded to 2 decimals)
pred_binary =
  tibble(item_id = names(P_binary),
         proportion_correct = round(100 * P_binary, 2)
)

For multiple-choice items, we assume that the student follows an exclusion strategy: They can rule out a non-negligible number of options at a coin-flip rate, and then guess among the remaining options.
# predictions for MC items
# simulation parameters for the semi-informed guessing model
theta = .5       # probability the simulated student eliminates any one wrong option
elim_prop = .75  # at most 75% of wrong options can be eliminated
n_sim = 426      # simulated respondents, matching N = 426 in the data
# Expected probability of answering one MC item correctly under the
# exclusion strategy: eliminate some wrong options at random, then guess
# uniformly among whatever options remain.
get_mc_pred =
  function(opts_string, n_sim, theta, elim_prop) {
    # options are comma-separated, so commas + 1 = option count
    n_options = str_count(opts_string, ",") + 1
    k_wrong = n_options - 1
    # an item with a single option is always answered correctly
    if (k_wrong == 0) return(1)
    # hard cap on how many wrong options can be ruled out
    max_elim = floor(elim_prop * k_wrong)
    # per-respondent count of successfully eliminated wrong options
    n_elim = pmin(rbinom(n_sim, k_wrong, theta), max_elim)
    # uniform guess among surviving options; average over respondents
    mean(1 / ((k_wrong - n_elim) + 1))
  }
# generate predictions for each mc item
pred_mc =
  df.binaryvsmc %>%
  filter(item_type == "mc") %>%                # MC items only
  distinct(item_id, possible_responses) %>%    # one row per item
  mutate(
    # expected percent correct under the guessing model, 2 decimals
    proportion_correct =
      round(100 * map_dbl(possible_responses,
                          ~ get_mc_pred(.x,
                                        n_sim = n_sim,
                                        theta = theta,
                                        elim_prop = elim_prop)),
            2)) %>%
  dplyr::select(item_id, proportion_correct)
# merge datasets
# Stack binary and MC predictions into one table, keyed by numeric item id
prediction_final =
  bind_rows(pred_binary %>% mutate(item_id = as.numeric(item_id)),
            pred_mc %>% mutate(item_id = as.numeric(item_id))) %>%
  arrange(item_id)
# create dataset with observed % correct values per item
# Observed percent correct per item (plus the test it belongs to)
observed_final =
  df.irt %>%
  group_by(item_id) %>%
  summarise(actual_prop = 100 * mean(correct_response, na.rm = TRUE),
            test_name = first(test_name),
            .groups = "drop") %>%
  mutate(item_id = as.numeric(item_id))  # numeric key for joining
# Merge observed proportions with the model predictions
results =
  prediction_final %>%
  rename(predicted_prop = proportion_correct) %>%
left_join(observed_final, by = "item_id")

correlation = cor.test(results$predicted_prop, results$actual_prop)
# plot
# Predicted vs observed percent correct, one point per item
results %>%
  ggplot(aes(x = predicted_prop, y = actual_prop, color = test_name)) +
  geom_point(alpha = 0.5, size = 2) +
  # dashed identity line marks perfect prediction
  geom_abline(intercept = 0, slope = 1, linetype = "dashed",
              color = "darkblue", size = 1) +
  scale_color_brewer(palette = "Set1", name = "Test") +
  # correlation coefficient annotated in the bottom-right corner
  annotate("text", x = 100, y = 5,
           label = paste0("r = ", round(correlation$estimate, 3)),
           hjust = 1, vjust = 0, size = 7, fontface = "bold") +
  labs(title = "Predicted vs. Observed Item Difficulty (measured as % correct)",
       subtitle = "Individual dots represent individual test items",
       x = "Predicted % correct ('low-literacy guessing student' model)",
       y = "Observed % correct (N = 426 subjects)"
)

The correlation analysis shows that the model predictions are significantly but only weakly correlated with the observed data. The RMSE evaluation shows that the low-literacy guessing model performs worse than a baseline model that fits just the mean. In other words, the guessing model is not a good explanation for the data at hand; it is highly unlikely that high-performing respondents achieved a large number of correct responses by simply doing (informed) guessing.
r(200) = .22, 95% CIs = [.09, .35], p = .002
RMSE_model = 35.17 vs RMSE_baseline = 27.22 (scale: whole percentage points, 0-100)
# Simple correlation
# Pearson correlation between predicted and observed percent correct
correlation = cor.test(results$predicted_prop, results$actual_prop)
print(correlation)
## Pearson's product-moment correlation
##
## data: results$predicted_prop and results$actual_prop
## t = 3.1997, df = 200, p-value = 0.0016
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.08522276 0.34812299
## sample estimates:
## cor
## 0.2206775
# Calculate RMSE
# Baseline: RMSE of predicting the mean observed value for every item.
# na.rm = TRUE added so the baseline handles missing values the same way
# as the model RMSE below (the original silently assumed no NAs here).
baseline_rmse =
  sqrt(mean((results$actual_prop - mean(results$actual_prop, na.rm = TRUE))^2,
            na.rm = TRUE))
print(baseline_rmse)## [1] 27.21975
# RMSE of the guessing-model predictions against the observed data
rmse =
  sqrt(mean((results$predicted_prop - results$actual_prop)^2,
            na.rm = TRUE))
print(rmse)## [1] 35.17515
## Baseline RMSE = 27.2 %
## Model RMSE = 35.2 %
We have a held-out 20% of data that contains entirely new items. We now have the models above make predictions for that held-out dataset (predicting proportion correct for these new items).
# Import held out data with participant info
df.heldout =
  read.csv(paste0(data_dir, "test_data.csv"))
# Flag each held-out item as binary (2 options) vs multiple choice (>2)
df.heldout_binaryvsmc =
  df.heldout %>%
  dplyr::select(item_id, possible_responses) %>%
  distinct() %>%
  mutate(
    # options are comma-separated, so commas + 1 = option count
    n_options = str_count(possible_responses, ",") + 1,
    item_type = if_else(n_options == 2, "binary", "mc"))
# held-out item ids by type
binary_items =
  df.heldout_binaryvsmc %>%
  filter(item_type == "binary") %>%
  pull(item_id)
mc_items =
  df.heldout_binaryvsmc %>%
  filter(item_type == "mc") %>%
  pull(item_id)
# run predictions for MC items
# define parameters
theta = .5       # probability the student eliminates any one wrong option
elim_prop = .75  # student can eliminate at most 75% of wrong options
n_sim = 426      # simulated respondents, matching N = 426 in the data
# Expected probability of a correct response on one MC item: the student
# eliminates wrong options at rate theta (capped by elim_prop), then
# guesses uniformly among the remaining options.
get_mc_pred =
  function(opts_string, n_sim, theta, elim_prop) {
    # comma-separated options, so commas + 1 = option count
    n_options = str_count(opts_string, ",") + 1
    # wrong options = total options minus the one correct answer
    k_wrong = n_options - 1
    if (k_wrong == 0) return(1)  # single-option item: always correct
    # cap on eliminations (at most elim_prop of the wrong options)
    max_elim = floor(elim_prop * k_wrong)
    # number of wrong options each simulated respondent rules out
    n_elim = pmin(rbinom(n_sim, k_wrong, theta), max_elim)
    remain_wrong = k_wrong - n_elim
    # expected probability of selecting the correct response
    mean(1 / (remain_wrong + 1))
  }
# generate predictions for mc items
# Expected percent correct for each held-out MC item
pred_mc_heldout =
  df.heldout_binaryvsmc %>%
  filter(item_type == "mc") %>%
  distinct(item_id, possible_responses) %>%  # one row per item
  mutate(
    predicted_prop =
      round(100 * map_dbl(possible_responses,
                          ~ get_mc_pred(.x,
                                        n_sim = n_sim,
                                        theta = theta,
                                        elim_prop = elim_prop)),
            2)) %>%  # round to two digits
  # keep only relevant cols
dplyr::select(item_id, predicted_prop)

results_heldout =
df.heldout %>%
dplyr::select(item_id, proportion_correct, test_name) %>%
rename(actual_prop = proportion_correct) %>%
left_join(export) %>%
rename(predicted_prop = proportion_correct)
# plot
# Held-out comparison: predicted vs observed accuracy, rescaled to 0-100
results_heldout %>%
  ggplot(aes(x = predicted_prop * 100,
             y = actual_prop * 100,
             color = test_name)) +
  geom_point(alpha = 0.5, size = 2) +
  # dashed identity line marks perfect prediction
  geom_abline(intercept = 0, slope = 1, linetype = "dashed",
              color = "darkblue", size = 1) +
  scale_color_brewer(palette = "Set1", name = "Test") +
  xlim(0, 100) +
  ylim(0, 100) +
  labs(title = "HELD-OUT DATA: Predicted vs. Observed Item Difficulty (measured as % correct)",
       subtitle = "Individual dots represent individual test items",
       x = "Predicted % correct ('low-literacy guessing student' model)",
       y = "Observed % correct (N = 426 subjects)")